# Start from an empty workspace: removes every object in the calling
# environment (the global environment when the script is run top to bottom).
rm(list = ls())
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
library(scales)
library(ggrepel)
# NOTE(review): fwgroups() is called further down but is not defined in the
# visible part of this file, so it presumably comes from this sourced
# script -- confirm.
source("fightin_words.R")
library(haven)

# NOTE(review): just_w1, just_combined and just_topup are used below without
# being created in this file, so they are presumably restored by this load()
# call -- confirm against the script that writes survey.Rdata.
load("../working/survey.Rdata")

# Time spent on pre-amble to each question

#' Bootstrap standard error of the median
#'
#' Draws `B` resamples of `x` (each the same length as `x`, with
#' replacement), takes the median of every resample after dropping missing
#' values, and returns the standard deviation of those medians.
#'
#' @param x Vector of values; may contain `NA`.
#' @param B Number of bootstrap resamples.
#' @return A single number: the standard deviation of the `B` resampled
#'   medians. `NA` when it cannot be computed (for example `x` is empty or
#'   `B` is smaller than 2).
boot_median_sd <- function(x, B = 10) {
  n_obs <- length(x)

  resampled_medians <- vapply(
    seq_len(B),
    function(b) {
      # sample.int() rather than sample(1:length(x)): the latter expands to
      # c(1, 0) for empty input and would index outside `x`.
      median(x[sample.int(n_obs, replace = TRUE)], na.rm = TRUE)
    },
    FUN.VALUE = numeric(1)
  )

  sd(resampled_medians)
}

# Median of each duration column (plotted as "Number of seconds") by `treat`,
# with intervals of est +/- 1.96 bootstrap standard errors of the median.

# Fix the RNG state so the bootstrap intervals, and therefore the saved
# figure, are identical from one run of the script to the next.
set.seed(20240101)

just_w1 %>%
  select(treat, contains("_text") & contains("duration")) %>%
  pivot_longer(-treat) %>%
  group_by(treat, name) %>%
  summarise(
    est = median(value, na.rm = TRUE),
    boot_se = boot_median_sd(value, B = 500),
    lo = est - 1.96 * boot_se,
    hi = est + 1.96 * boot_se,
    .groups = "drop"
  ) %>%
  mutate(
    # Map the raw column names onto display labels; names matching none of
    # the patterns become NA.
    name = case_when(
      grepl("mw_", name) ~ "Minimum Wage",
      grepl("speech_", name) ~ "Offensive Speech",
      grepl("tax_", name) ~ "High Tax",
      grepl("unemp_", name) ~ "Unemployment Support",
      grepl("zero", name) ~ "Zero Hours Contracts",
      grepl("trans", name) ~ "Transgender Rights"
    ),
    # Explicit level order fixes the order of the rows on the y axis.
    name = factor(
      name,
      levels = c(
        "Minimum Wage", "Offensive Speech", "Zero Hours Contracts",
        "Unemployment Support", "High Tax", "Transgender Rights"
      )
    )
  ) %>%
  ggplot(aes(x = est, xmin = lo, xmax = hi, y = name, col = treat)) +
  geom_pointrange() +
  theme_bw() +
  xlab("Number of seconds") +
  ylab("") +
  scale_color_manual("", values = c("black", "gray")) +
  theme(legend.position = "bottom")

ggsave("../out/outcomes/attention_in_seconds.pdf", width = 6, height = 3)


# Histogram of each duration column among rows with treat == "Treatment",
# in 15-unit bins up to 300 plus one open-ended bin above 300, one facet per
# question.
just_w1 %>%
  select(treat, contains("_text") & contains("duration")) %>%
  pivot_longer(-treat) %>%
  filter(treat == "Treatment" & !is.na(value)) %>%
  mutate(
    # The top break is Inf rather than 1000 so that the bin labelled "(300+]"
    # really is open-ended. With a finite top break, any longer duration was
    # turned into NA by cut() and then silently removed by the filter below.
    value = cut(value, breaks = c(seq(0, 300, 15), Inf), include.lowest = TRUE),
    value = fct_recode(value, "(300+]" = "(300,Inf]"),
    name = case_when(
      name == "duration_mw_text" ~ "Minimum Wage",
      name == "duration_speech_text" ~ "Offensive Speech",
      name == "duration_tax_text" ~ "High Tax",
      name == "duration_trans_text" ~ "Transgender Rights",
      name == "duration_unemp_text" ~ "Unemployment Support",
      name == "duration_zero_text" ~ "Zero Hours Contracts"
    )
  ) %>%
  # cut() returns NA for values outside the breaks (here: below 0).
  filter(!is.na(value)) %>%
  ggplot(aes(x = value)) +
  geom_bar() +
  facet_wrap(~name) +
  theme_bw() +
  xlab("") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  ylab("")
ggsave("../out/outcomes/attention_in_seconds_binned_treatment.pdf", width = 10, height = 5)


# Number of words written on each issue in the treatment group

# Long table with one row per non-empty "just_*" text from rows where
# treat == "Treatment"; columns whose names contain "against" or "favour"
# are left out.
all_texts <- just_combined %>%
  filter(treat == "Treatment") %>%
  select(
    match_id, treat,
    starts_with("just_") & !contains("against") & !contains("favour")
  ) %>%
  pivot_longer(cols = -c(treat, match_id)) %>%
  filter(value != "")

# Token counts per text: print their distribution, then keep them alongside
# the texts.
all_texts_corpus <- corpus(all_texts, text_field = "value")
all_texts_corpus %>%
  ntoken() %>%
  summary()

all_texts[["ntokens"]] <- ntoken(all_texts_corpus)

# Mean tokens per text for each match_id, plotted against `attention`.
all_texts %>%
  group_by(match_id) %>%
  summarise(ntokens = mean(ntokens)) %>%
  full_join(select(just_combined, match_id, attention)) %>%
  ggplot(mapping = aes(x = attention, y = ntokens)) +
  geom_point(alpha = 0.2)

# Mean tokens per text for each text column.
summarise(group_by(all_texts, name), ntokens = mean(ntokens))

# Predicting issue positions from texts


# Copy every "_w2" column listed below to a column of the same name without
# the suffix (the "_w2" originals are kept). fightin_words() later looks up
# the unsuffixed names in the stacked data.
just_topup <- just_topup %>%
  mutate(across(
    all_of(c(
      "high_tax_w2", "zero_hours_w2", "unemployment_support_w2",
      "trans_rights_w2", "offensive_speech_w2", "minimum_wage_w2",
      "just_high_tax_w2", "just_zero_hours_w2",
      "just_unemployment_support_w2", "just_trans_rights_w2",
      "just_offensive_speech_w2", "just_minimum_wage_w2"
    )),
    .fns = identity,
    # Output name = input name with the trailing "_w2" stripped.
    .names = "{sub('_w2$', '', .col)}"
  ))

# Stack the two data sets, just_topup rows first.
just_all <- just_topup %>% bind_rows(just_w1)

#' Plot the words that best separate two groups of respondents on an issue
#'
#' Splits the rows of `data` at `cutoff` on the column `var_name`, builds a
#' document-feature matrix of unigrams and bigrams from the matching
#' `just_<var_name>` text column, scores every feature with `fwgroups()`,
#' and saves a scatter plot of score against log frequency in which the most
#' extreme features at each end are labelled.
#'
#' @param var_name Name of the column in `data` to split on. The texts are
#'   read from the column `paste0("just_", var_name)`, and `var_name` is also
#'   used as the output file name.
#' @param cutoff Rows whose `var_name` value is strictly greater than
#'   `cutoff` form the "Increase" group; all other non-missing values form
#'   "Don't increase".
#' @param title Plot title.
#' @param data Data frame holding both columns. Defaults to the global
#'   `just_all`.
#' @param n_to_plot Number of features to label at each end of the score
#'   distribution.
#' @return The value of `ggsave()`; called for its side effect of writing
#'   `../out/outcomes/words/<var_name>.pdf`.
fightin_words <- function(var_name, cutoff, title, data = just_all,
                          n_to_plot = 30) {

  # "Increase" is the first level, so it is the first document after
  # dfm_group() below.
  data$fw_group <- factor(
    ifelse(data[[var_name]] > cutoff, "Increase", "Don't increase"),
    levels = c("Increase", "Don't increase")
  )
  issue_corpus <- corpus(data, text_field = paste0("just_", var_name))

  # Unigrams and bigrams without punctuation, symbols or English stopwords;
  # keep features that occur at least 5 times and appear in at most 10% of
  # documents.
  my_dfm <- issue_corpus %>%
    tokens(remove_punct = TRUE, remove_symbols = TRUE) %>%
    tokens_remove(stopwords("en")) %>%
    tokens_ngrams(1:2) %>%
    dfm() %>%
    dfm_trim(min_termfreq = 5) %>%
    dfm_trim(max_docfreq = .1, docfreq_type = "prop")

  # textstat_keyness() supplies the feature list and the per-group counts;
  # only their sum (overall frequency) is plotted.
  word_stats <- my_dfm %>%
    dfm_group(fw_group) %>%
    textstat_keyness(measure = "chi")
  word_stats$n <- word_stats$n_target + word_stats$n_reference

  # Plotted score: first row of the `zeta` element returned by fwgroups(),
  # matched to features by row name.
  fwgroups_out <- fwgroups(my_dfm, docvars(my_dfm)$fw_group)
  fw_scores <- data.frame(zeta = t(fwgroups_out$zeta)[, 1])
  word_stats$y <- fw_scores[match(word_stats$feature, rownames(fw_scores)), 1]

  # Label the n_to_plot highest- and lowest-scoring features. na.last = NA
  # drops unscored features and head() copes with fewer than n_to_plot rows,
  # so no NA indices are produced.
  highest <- head(order(word_stats$y, decreasing = TRUE, na.last = NA), n_to_plot)
  lowest <- head(order(word_stats$y, decreasing = FALSE, na.last = NA), n_to_plot)
  word_stats$plot_words <- seq_len(nrow(word_stats)) %in% c(highest, lowest)

  # Symmetric y-axis limit around zero.
  lim <- ceiling(max(abs(word_stats$y), na.rm = TRUE))

  word_plot <- ggplot(word_stats, aes(x = log(n), y = y, label = feature, col = y)) +
    geom_point(alpha = .3) +
    geom_text_repel(aes(label = feature, cex = sqrt(abs(y))), point.padding = .05,
                    box.padding = unit(0.20, "lines"), show.legend = FALSE,
                    max.overlaps = 100,
                    data = word_stats[word_stats$plot_words, ]) +
    theme_bw() +
    scale_color_gradient2("", low = muted("red"), mid = "gray", high = muted("blue"),
                          guide = "none", space = "Lab",
                          limits = range(word_stats$y, na.rm = TRUE)) +
    xlab("Log frequency of word") +
    ylab("Z-score") +
    ggtitle(title) +
    scale_y_continuous(breaks = c(seq(-lim, lim, 2)), limits = c(-lim, lim))

  # Pass the plot explicitly instead of relying on last_plot().
  ggsave(paste0("../out/outcomes/words/", var_name, ".pdf"),
         plot = word_plot, width = 5, height = 5)

}

  
# One word plot per issue; each row supplies the arguments of one
# fightin_words() call, run in the order listed.
pwalk(
  tribble(
    ~var_name,              ~cutoff, ~title,
    "high_tax",             3,       "Increase High Tax",
    "minimum_wage",         3,       "Increase Minimum Wage",
    "unemployment_support", 4,       "Increase Unemployment Support",
    "zero_hours",           4,       "Ban Zero Hours Contracts",
    "trans_rights",         3,       "Improve Transgender Rights",
    "offensive_speech",     1,       "Ban Offensive Speech"
  ),
  fightin_words
)


